import time
start_time = time.time()
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import math
from pandas import Series,DataFrame
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn import metrics
import seaborn as sns
pd.options.display.max_columns = 500
pd.options.display.max_rows = 50
# Load the baseline capture into a DataFrame; row 0 of the file holds the
# column names.
df = pd.read_csv(
    "/Users/datasci/Python/CiscoAML/old/dataset-comparison/NIDS-AML-Baselines/NIDS-AML-Baselines-queen.csv",
    header=0,
)
# Preview the first rows (the result is only rendered in an interactive
# session / notebook; as a plain script this line has no visible effect).
df.head()
# These are baseline samples, so they are all labelled benign.
# Every row in this file is a baseline sample, so the whole frame receives the
# benign label (0).
BENIGN_LABEL = 0
df["Label"] = BENIGN_LABEL
# Preview again to confirm the new column (rendered only interactively).
df.head()
# General data exploration of the sample
# Draw one count plot per categorical column: rows per source IP, rows per
# destination IP, and rows per protocol. Each entry supplies the column name,
# the figure size in inches, and any extra tick-label styling. The two IP
# plots use very wide figures and a large tick font.
for column, figure_size, tick_style in (
    ("Src IP", (200, 10), {"fontsize": 40}),
    ("Dst IP", (240, 15), {"fontsize": 40}),
    ("Protocol", (10, 4), {}),
):
    plt.figure(figsize=figure_size)
    plot = sns.countplot(x=column, data=df)
    # Slant the category labels so neighbouring ticks do not overlap.
    plot.set_xticklabels(
        plot.get_xticklabels(), rotation=40, ha="right", **tick_style
    )
    plt.tight_layout()
    plt.show()
# Count how many rows use each source port, most frequent first.
sp_df = df['Src Port'].copy()
values = sp_df.value_counts(sort=True)
# Turn the counts Series into a two-column table: the port number moves out of
# the index into 'port', and the tally lands in 'count'.
values_df = values.rename_axis('port').reset_index(name='count')

# sns.set() only configures the global style and returns None, so its result
# is not assigned.
sns.set(style='ticks')
# Scatter of row count against source port number.
i = sns.relplot(x="port", y="count", data=values_df, aspect=3)
i.set_axis_labels('source port').set(xticks=[1024, 49000, 65535])
# plt.show() accepts only the keyword-only `block` argument; passing the
# FacetGrid positionally raises TypeError on current matplotlib.
plt.show()
# Count how many rows use each destination port, most frequent first.
dp_df = df['Dst Port'].copy()
values = dp_df.value_counts(sort=True)
# Turn the counts Series into a two-column table: the port number moves out of
# the index into 'port', and the tally lands in 'count'.
values_df = values.rename_axis('port').reset_index(name='count')

# Keep only ports 0-1023. The previous bound (< 1023) excluded port 1023,
# contradicting the "less than 1024" intent stated by the variable name.
less_than_1024 = values_df[values_df.port < 1024]

# sns.set() only configures the global style and returns None, so its result
# is not assigned.
sns.set(style='ticks')
# Scatter of row count against destination port number for the low ports.
j = sns.relplot(x="port", y="count", data=less_than_1024, aspect=3)
# plt.show() accepts only the keyword-only `block` argument; passing the
# FacetGrid positionally raises TypeError on current matplotlib.
plt.show()
# Report the wall-clock time elapsed since start_time was recorded at the top
# of the script. The measurement is taken before the CSV is written, so the
# export below is not included in the figure.
elapsed_seconds = time.time() - start_time
print("--- %s seconds ---" % elapsed_seconds)

# Write the labelled frame to the working directory, without the row index.
df.to_csv(
    "NIDS-AML-Baselines-queen-labeled.csv",
    encoding='utf-8',
    index=False,
)